Covid-19 Analysis and Visualization using Plotly Express & Matplotlib¶

Stepwise Implementation¶

Importing Necessary Libraries¶

In [126]:
# Data analysis and Manipulation
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt

# Importing Plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)

# Initializing Plotly
pio.renderers.default = 'colab'

Importing the Datasets¶

In [128]:
#importing df1
df1 = pd.read_csv(r"C:\Users\biswa\Downloads\covid.csv")
df1.head(7) # returns first 7 rows
Out[128]:
Country/Region Continent Population TotalCases NewCases TotalDeaths NewDeaths TotalRecovered NewRecovered ActiveCases Serious,Critical Tot Cases/1M pop Deaths/1M pop TotalTests Tests/1M pop WHO Region iso_alpha
0 USA North America 3.311981e+08 5032179 NaN 162804.0 NaN 2576668.0 NaN 2292707.0 18296.0 15194.0 492.0 63139605.0 190640.0 Americas USA
1 Brazil South America 2.127107e+08 2917562 NaN 98644.0 NaN 2047660.0 NaN 771258.0 8318.0 13716.0 464.0 13206188.0 62085.0 Americas BRA
2 India Asia 1.381345e+09 2025409 NaN 41638.0 NaN 1377384.0 NaN 606387.0 8944.0 1466.0 30.0 22149351.0 16035.0 South-EastAsia IND
3 Russia Europe 1.459409e+08 871894 NaN 14606.0 NaN 676357.0 NaN 180931.0 2300.0 5974.0 100.0 29716907.0 203623.0 Europe RUS
4 South Africa Africa 5.938157e+07 538184 NaN 9604.0 NaN 387316.0 NaN 141264.0 539.0 9063.0 162.0 3149807.0 53044.0 Africa ZAF
5 Mexico North America 1.290662e+08 462690 6590.0 50517.0 819.0 308848.0 4140.0 103325.0 3987.0 3585.0 391.0 1056915.0 8189.0 Americas MEX
6 Peru South America 3.301632e+07 455409 NaN 20424.0 NaN 310337.0 NaN 124648.0 1426.0 13793.0 619.0 2493429.0 75521.0 Americas PER
In [129]:
#Returns tuple of shape(Rows, Columns)
print(df1.shape)

#Returns size of df1
print (df1.size)
(209, 17)
3553
In [130]:
# Information about df1
# return concise summary of dataframe
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country/Region    209 non-null    object 
 1   Continent         208 non-null    object 
 2   Population        208 non-null    float64
 3   TotalCases        209 non-null    int64  
 4   NewCases          4 non-null      float64
 5   TotalDeaths       188 non-null    float64
 6   NewDeaths         3 non-null      float64
 7   TotalRecovered    205 non-null    float64
 8   NewRecovered      3 non-null      float64
 9   ActiveCases       205 non-null    float64
 10  Serious,Critical  122 non-null    float64
 11  Tot Cases/1M pop  208 non-null    float64
 12  Deaths/1M pop     187 non-null    float64
 13  TotalTests        191 non-null    float64
 14  Tests/1M pop      191 non-null    float64
 15  WHO Region        184 non-null    object 
 16  iso_alpha         209 non-null    object 
dtypes: float64(12), int64(1), object(4)
memory usage: 27.9+ KB
In [131]:
#importing df2
df2 = pd.read_csv(r"C:\Users\biswa\Downloads\covid_grouped.csv")
df2.head(7)  # returns first 7 rows
Out[131]:
Date Country/Region Confirmed Deaths Recovered Active New cases New deaths New recovered WHO Region iso_alpha
0 2020-01-22 Afghanistan 0 0 0 0 0 0 0 Eastern Mediterranean AFG
1 2020-01-22 Albania 0 0 0 0 0 0 0 Europe ALB
2 2020-01-22 Algeria 0 0 0 0 0 0 0 Africa DZA
3 2020-01-22 Andorra 0 0 0 0 0 0 0 Europe AND
4 2020-01-22 Angola 0 0 0 0 0 0 0 Africa AGO
5 2020-01-22 Antigua and Barbuda 0 0 0 0 0 0 0 Americas ATG
6 2020-01-22 Argentina 0 0 0 0 0 0 0 Americas ARG
In [132]:
#Returns tuple of shape(Rows, Columns)
print (df2.shape)

#Returns size of df2
print (df2.size)
(35156, 11)
386716
In [133]:
# Information about df2
# return concise summary of dataframe

df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35156 entries, 0 to 35155
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            35156 non-null  object
 1   Country/Region  35156 non-null  object
 2   Confirmed       35156 non-null  int64 
 3   Deaths          35156 non-null  int64 
 4   Recovered       35156 non-null  int64 
 5   Active          35156 non-null  int64 
 6   New cases       35156 non-null  int64 
 7   New deaths      35156 non-null  int64 
 8   New recovered   35156 non-null  int64 
 9   WHO Region      35156 non-null  object
 10  iso_alpha       35156 non-null  object
dtypes: int64(7), object(4)
memory usage: 3.0+ MB
In [134]:
#importing df3
df3 = pd.read_csv(r"C:\Users\biswa\Downloads\coviddeath.csv")
df3.head(7) # returns first 7 rows
Out[134]:
Data as of Start Week End Week State Condition Group Condition ICD10_codes Age Group Number of COVID-19 Deaths Flag
0 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 0-24 122.0 NaN
1 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 25-34 596.0 NaN
2 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 35-44 1521.0 NaN
3 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 45-54 4186.0 NaN
4 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 55-64 10014.0 NaN
5 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 65-74 16301.0 NaN
6 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 75-84 19091.0 NaN
In [135]:
#Returns tuple of shape(Rows, Columns)
print(df3.shape)

#Returns size of df3
print(df3.size)
(12260, 10)
122600
In [136]:
# Information about df3
# return concise summary of dataframe


df3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12260 entries, 0 to 12259
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Data as of                 12260 non-null  object 
 1   Start Week                 12260 non-null  object 
 2   End Week                   12260 non-null  object 
 3   State                      12260 non-null  object 
 4   Condition Group            12260 non-null  object 
 5   Condition                  12260 non-null  object 
 6   ICD10_codes                12260 non-null  object 
 7   Age Group                  12260 non-null  object 
 8   Number of COVID-19 Deaths  5354 non-null   float64
 9   Flag                       6906 non-null   object 
dtypes: float64(1), object(9)
memory usage: 957.9+ KB

Dataset cleaning¶

In [138]:
# Columns labels of a df1

df1.columns
Out[138]:
Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region', 'iso_alpha'],
      dtype='object')
In [139]:
# Drop NewCases, NewDeaths, NewRecovered rows from dataset1

df1.drop(['NewCases', 'NewDeaths', 'NewRecovered'],  axis=1, errors = 'ignore')

# Select random set of values from dataset1
df1.sample(5)
Out[139]:
Country/Region Continent Population TotalCases NewCases TotalDeaths NewDeaths TotalRecovered NewRecovered ActiveCases Serious,Critical Tot Cases/1M pop Deaths/1M pop TotalTests Tests/1M pop WHO Region iso_alpha
152 Botswana Africa 2356075.0 804 NaN 2.0 NaN 63.0 NaN 739.0 1.0 341.0 0.8 68423.0 29041.0 Africa BWA
44 Poland Europe 37842302.0 49515 NaN 1774.0 NaN 35642.0 NaN 12099.0 72.0 1308.0 47.0 2374686.0 62752.0 Europe POL
138 Liberia Africa 5068618.0 1224 NaN 78.0 NaN 705.0 NaN 441.0 NaN 241.0 15.0 NaN NaN Africa LBR
207 Vatican City Europe 801.0 12 NaN NaN NaN 12.0 NaN 0.0 NaN 14981.0 NaN NaN NaN Europe VAT
3 Russia Europe 145940924.0 871894 NaN 14606.0 NaN 676357.0 NaN 180931.0 2300.0 5974.0 100.0 29716907.0 203623.0 Europe RUS

Creating table using plotly express¶

In [141]:
# Import create_table Figure Factory

from plotly.figure_factory import create_table

colorscale = [[0, '#4d004c'], [.5, '#f2e5ff'], [1, '#ffffff']]
df1_table = create_table(df1.head(15), colorscale=colorscale)

py.iplot(df1_table)

Bar graphs- Comparisons between COVID infected countries in terms of total cases, total deaths, total recovered & total tests¶

Primarily look at the country with respect to a total number of cases by top 15 countries only and color total cases and hover data as ‘Country/Region’, ‘Continent’.¶

Example : Bar Graph¶

In [145]:
px.bar(df1.head(15), x = 'Country/Region', y = 'TotalCases', color = 'TotalCases', height = 400, hover_data = ['Country/Region', 'Continent'])

Example : Bar Graph¶

In [147]:
px.bar(df1.head(15), x = 'Country/Region', y = 'TotalCases', color = 'TotalDeaths', height = 400, hover_data = ['Country/Region', 'Continent'])
In [148]:
px.bar(df1.head(15), x = 'Country/Region', y = 'TotalCases', color = 'TotalRecovered', height = 400, hover_data = ['Country/Region', 'Continent'])
In [149]:
px.bar(df1.head(15), x = 'Country/Region', y = 'TotalCases', color = 'TotalTests', height = 400, hover_data = ['Country/Region', 'Continent'])

create a horizontal orientation plot with X-axis as ‘TotalTests’ and Y-axis as ‘Country/Region’ with passing parameter orientation=”h” and color the plot by ‘TotalTests’.¶

In [151]:
px.bar(df1.head(15), x = 'TotalTests', y = 'Country/Region', color = 'TotalTests',orientation = 'h' , height = 500, hover_data = ['Country/Region', 'Continent'])

Let’s look at ‘TotalTests’ followed by ‘Continent’¶

In [153]:
px.bar(df1.head(15), x = 'TotalTests', y = 'Continent', color = 'TotalTests',orientation = 'h' , height = 300, hover_data = ['Country/Region', 'Continent'])

Data Visualization through Bubble Charts-Continent Wise¶

In [155]:
px.scatter(df1, x = 'Continent', y = 'TotalCases', color = 'TotalCases', hover_data = ['Country/Region', 'Continent'], size = 'TotalCases', size_max= 60, log_y = True)
In [156]:
px.scatter(df1.head(50), x = 'Continent', y = 'TotalTests', color = 'TotalTests', hover_data = ['Country/Region', 'Continent'], size = 'TotalTests', size_max= 60, log_y = True)

Data Visualization through Bubble Charts-Country Wise¶

In [158]:
px.scatter(df1.head(100), x = 'Country/Region', y = 'TotalCases', color = 'TotalCases', size = 'TotalCases', hover_data = ['Country/Region', 'Continent'], size_max = 80)

Now, the Country/Region with respect to the total number of cases for top 30 countries only and color the total number of cases and take the hover data as ‘Country/Region’, ‘Continent’.¶

In [160]:
px.scatter(df1.head(30), x = 'Country/Region', y = 'TotalCases', color = 'Country/Region', size = 'TotalCases', hover_data = ['Country/Region', 'Continent'], size_max = 60, log_y = True)

country/region in relation to the total number of deaths.¶

In [162]:
px.scatter(df1.head(10), x = 'Country/Region', y = 'TotalDeaths', color = 'Country/Region', size = 'TotalDeaths', hover_data = ['Country/Region', 'Continent'], size_max = 40, height = 400)

Country/Region VS Tests/1M pop (color-scale of Tests/1M pop)¶

In [164]:
px.scatter(df1.head(30), x = 'Country/Region', y = 'Tests/1M pop', color = 'Tests/1M pop', size = 'Tests/1M pop', hover_data = ['Country/Region', 'Continent'], height = 400, size_max = 40)

Bar chart¶

In [166]:
df2.head(5)
Out[166]:
Date Country/Region Confirmed Deaths Recovered Active New cases New deaths New recovered WHO Region iso_alpha
0 2020-01-22 Afghanistan 0 0 0 0 0 0 0 Eastern Mediterranean AFG
1 2020-01-22 Albania 0 0 0 0 0 0 0 Europe ALB
2 2020-01-22 Algeria 0 0 0 0 0 0 0 Africa DZA
3 2020-01-22 Andorra 0 0 0 0 0 0 0 Europe AND
4 2020-01-22 Angola 0 0 0 0 0 0 0 Africa AGO

Advanced Data Visualization- Bar graphs for All top infected Countries¶

In this task, we will explore covid-19 data using bar graphs and charts and use dataset2 as it has date column.¶

In [168]:
px.bar(df2, x = 'Date', y = 'Confirmed', color = 'Confirmed', hover_data = ['Confirmed', 'Country/Region', 'Date'], height = 400)

log_y=True → This applies a logarithmic scale to the y-axis. Effect: Helps visualize data with large variations by compressing high values and expanding smaller ones.¶

In [170]:
px.bar(df2, x = 'Date', y = 'Confirmed', color = 'Confirmed', hover_data = ['Confirmed', 'Country/Region', 'Date'], height = 400, log_y = True)

Countries Specific COVID Data Visualization: (United States)¶

Refining dataset to get only USA data¶

In [172]:
df_us = df2.loc[df2['Country/Region'] == 'US']
df_us
Out[172]:
Date Country/Region Confirmed Deaths Recovered Active New cases New deaths New recovered WHO Region iso_alpha
173 2020-01-22 US 1 0 0 1 0 0 0 Americas USA
360 2020-01-23 US 1 0 0 1 0 0 0 Americas USA
547 2020-01-24 US 2 0 0 2 1 0 0 Americas USA
734 2020-01-25 US 2 0 0 2 0 0 0 Americas USA
921 2020-01-26 US 5 0 0 5 3 0 0 Americas USA
... ... ... ... ... ... ... ... ... ... ... ...
34394 2020-07-23 US 4038816 144430 1233269 2661117 68695 1114 22420 Americas USA
34581 2020-07-24 US 4112531 145560 1261624 2705347 73715 1130 28355 Americas USA
34768 2020-07-25 US 4178970 146465 1279414 2753091 66439 905 17790 Americas USA
34955 2020-07-26 US 4233923 146935 1297863 2789125 54953 470 18449 Americas USA
35142 2020-07-27 US 4290259 148011 1325804 2816444 56336 1076 27941 Americas USA

188 rows × 11 columns

In [173]:
px.bar(df_us, x="Date", y="Confirmed", color="Confirmed", height=300)
In [174]:
px.bar(df_us,x="Date", y="Recovered", color="Recovered", height=300)

Visualization of Data in terms of Maps¶

Creating map¶

In [177]:
df2 = pd.read_csv(r"C:\Users\biswa\Downloads\covid_grouped.csv")
df2
df2.columns
Out[177]:
Index(['Date', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Active',
       'New cases', 'New deaths', 'New recovered', 'WHO Region', 'iso_alpha'],
      dtype='object')
In [178]:
px.choropleth(df2,
              locations="iso_alpha",
              color="Confirmed",
              hover_name="Country/Region", 
              color_continuous_scale="Blues",
              animation_frame="Date")
In [179]:
px.choropleth(df2,
    locations = 'iso_alpha',
    color = 'Deaths',
    hover_name = 'Country/Region',
    color_continuous_scale="Viridis",
    animation_frame="Date"
    )

Natural earth projection¶

In [181]:
px.choropleth(df2,
              locations = 'iso_alpha',
              color = 'Recovered',
              hover_name = 'Country/Region',
              color_continuous_scale="RdYlGn",
              projection="natural earth",
              animation_frame="Date" 
              )

Bar graph animation¶

In [183]:
px.bar(df2,
        x = 'WHO Region',
        y = 'Confirmed',
        color = 'WHO Region',
        animation_frame = 'Date',
        hover_name = 'Country/Region'
      )
       

Visualize text using Word Cloud¶

Importing df3¶

In [186]:
df3 = pd.read_csv(r"C:\Users\biswa\Downloads\coviddeath.csv")
df3
Out[186]:
Data as of Start Week End Week State Condition Group Condition ICD10_codes Age Group Number of COVID-19 Deaths Flag
0 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 0-24 122.0 NaN
1 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 25-34 596.0 NaN
2 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 35-44 1521.0 NaN
3 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 45-54 4186.0 NaN
4 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 55-64 10014.0 NaN
... ... ... ... ... ... ... ... ... ... ...
12255 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 65-74 5024.0 NaN
12256 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 75-84 5381.0 NaN
12257 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 85+ 4841.0 NaN
12258 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 Not stated NaN Counts less than 10 suppressed.
12259 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 All ages 20628.0 NaN

12260 rows × 10 columns

Getting dataset information¶

In [188]:
df3.groupby(["Condition"]).count()
Out[188]:
Data as of Start Week End Week State Condition Group ICD10_codes Age Group Number of COVID-19 Deaths Flag
Condition
Adult respiratory distress syndrome 540 540 540 540 540 540 540 272 268
All other conditions and causes (residual) 540 540 540 540 540 540 540 363 177
Alzheimer disease 530 530 530 530 530 530 530 144 386
COVID-19 540 540 540 540 540 540 540 377 163
Cardiac arrest 520 520 520 520 520 520 520 219 301
Cardiac arrhythmia 540 540 540 540 540 540 540 192 348
Cerebrovascular diseases 530 530 530 530 530 530 530 187 343
Chronic lower respiratory diseases 540 540 540 540 540 540 540 229 311
Diabetes 540 540 540 540 540 540 540 276 264
Heart failure 540 540 540 540 540 540 540 204 336
Hypertensive diseases 540 540 540 540 540 540 540 264 276
Influenza and pneumonia 540 540 540 540 540 540 540 331 209
Intentional and unintentional injury, poisoning, and other adverse events 520 520 520 520 520 520 520 188 332
Ischemic heart disease 540 540 540 540 540 540 540 224 316
Malignant neoplasms 540 540 540 540 540 540 540 198 342
Obesity 530 530 530 530 530 530 530 182 348
Other diseases of the circulatory system 530 530 530 530 530 530 530 213 317
Other diseases of the respiratory system 540 540 540 540 540 540 540 188 352
Renal failure 540 540 540 540 540 540 540 238 302
Respiratory arrest 480 480 480 480 480 480 480 111 369
Respiratory failure 540 540 540 540 540 540 540 320 220
Sepsis 530 530 530 530 530 530 530 243 287
Vascular and unspecified dementia 530 530 530 530 530 530 530 191 339

Creating wordcloud¶

In [190]:
# import word cloud
from wordcloud import WordCloud

sentences = df3["Condition"].tolist()
sentences_as_a_string = ' '.join(sentences)


# Convert the string into WordCloud
plt.figure(figsize=(20, 20))
plt.imshow(WordCloud().generate(sentences_as_a_string))
Out[190]:
<matplotlib.image.AxesImage at 0x1919e6f0140>
No description has been provided for this image
In [191]:
column2_tolist= df3["Condition Group"].tolist()

# Convert the list to one single string
column_to_string= " ".join(column2_tolist)

# Convert the string into WordCloud
plt.figure(figsize=(20,20))
plt.imshow(WordCloud().generate(column_to_string))
Out[191]:
<matplotlib.image.AxesImage at 0x1919d74b800>
No description has been provided for this image
In [ ]: